Needleman- Wunsch Algorithm
The Needleman-Wunsch Algorithm is a dynamic programming-based technique used for performing global sequence alignment between two biological sequences, such as DNA, RNA, or protein sequences. Developed by Saul B. Needleman and Christian D. Wunsch in 1970, this algorithm aims to identify the best possible alignment between two sequences by maximizing the similarity score, which is calculated based on a scoring system that rewards matches and penalizes mismatches and gaps.
The algorithm works by constructing a scoring matrix, where each cell represents the alignment score for a pair of characters from the input sequences. It starts by initializing the first row and column of the matrix with gap penalties and then iteratively fills the remaining cells by choosing the maximum score from three possible directions: diagonal (match/mismatch), horizontal (gap in sequence A), or vertical (gap in sequence B). Once the matrix is complete, the optimal alignment is obtained by performing a traceback operation, starting from the bottom-right corner of the matrix and moving through the highest scoring path towards the top-left corner. The Needleman-Wunsch Algorithm is widely used in computational biology and bioinformatics for comparing homologous sequences, identifying conserved regions, and predicting evolutionary relationships.
/*
Petar 'PetarV' Velickovic
Algorithm: Needleman-Wunsch
*/
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include <iostream>
#include <vector>
#include <list>
#include <string>
#include <algorithm>
#include <queue>
#include <stack>
#include <set>
#include <map>
#include <complex>
#define MAX_N 1001
#define DPRINTC(C) printf(#C " = %c\n", (C))
#define DPRINTS(S) printf(#S " = %s\n", (S))
#define DPRINTD(D) printf(#D " = %d\n", (D))
#define DPRINTLLD(LLD) printf(#LLD " = %lld\n", (LLD))
#define DPRINTLF(LF) printf(#LF " = %.5lf\n", (LF))
using namespace std;
typedef long long lld;
typedef unsigned long long llu;
int n, m;
int match_score, mismatch_score, gap_score;
string A, B;
int dp[MAX_N][MAX_N];
/*
Needleman-Wunsch algorithm for determining the optimal alignment between two strings
assuming a given score for hits, gaps and mismatches.
Complexity: O(n * m) time, O(n * m) memory
*/
inline int needleman_wunsch()
{
for (int i=0;i<=n;i++) dp[i][0] = dp[0][i] = -i * gap_score;
for (int i=1;i<=n;i++)
{
for (int j=1;j<=m;j++)
{
int S = (A[i-1] == B[j-1]) ? match_score : -mismatch_score;
dp[i][j] = max(dp[i-1][j-1] + S, max(dp[i-1][j] - gap_score, dp[i][j-1] - gap_score));
}
}
return dp[n][m];
}
inline pair<string, string> get_optimal_alignment()
{
string retA, retB;
stack<char> SA, SB;
int ii = n, jj = m;
while (ii != 0 || jj != 0)
{
if (ii == 0)
{
SA.push('-');
SB.push(B[jj-1]);
jj--;
}
else if (jj == 0)
{
SA.push(A[ii-1]);
SB.push('-');
ii--;
}
else
{
int S = (A[ii-1] == B[jj-1]) ? match_score : -mismatch_score;
if (dp[ii][jj] == dp[ii-1][jj-1] + S)
{
SA.push(A[ii-1]);
SB.push(B[jj-1]);
ii--; jj--;
}
else if (dp[ii-1][jj] > dp[ii][jj-1])
{
SA.push(A[ii-1]);
SB.push('-');
ii--;
}
else
{
SA.push('-');
SB.push(B[jj-1]);
jj--;
}
}
}
while (!SA.empty())
{
retA += SA.top();
retB += SB.top();
SA.pop();
SB.pop();
}
return make_pair(retA, retB);
}
int main()
{
n = 5, m = 6;
match_score = 2, mismatch_score = 1, gap_score = 1;
A = "CATGT";
B = "ACGCTG";
printf("%d\n",needleman_wunsch());
pair<string, string> alignment = get_optimal_alignment();
printf("%s\n%s\n", alignment.first.c_str(), alignment.second.c_str());
return 0;
}